In [137]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
In [138]:
#from google.colab import files #upload one file at a time (hours and days)
#uploaded = files.upload()
In [139]:
#ls
In [140]:
hour=pd.read_csv('hour.csv')
In [141]:
day=pd.read_csv('day.csv')
In [142]:
hour.head()
Out[142]:
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0.0 3 13 16
1 2 2011-01-01 1 0 1 1 0 6 0 1 0.22 0.2727 0.80 0.0 8 32 40
2 3 2011-01-01 1 0 1 2 0 6 0 1 0.22 0.2727 0.80 0.0 5 27 32
3 4 2011-01-01 1 0 1 3 0 6 0 1 0.24 0.2879 0.75 0.0 3 10 13
4 5 2011-01-01 1 0 1 4 0 6 0 1 0.24 0.2879 0.75 0.0 0 1 1
In [143]:
day.head()
Out[143]:
instant dteday season yr mnth holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 1 0 1 0 6 0 2 0.344167 0.363625 0.805833 0.160446 331 654 985
1 2 2011-01-02 1 0 1 0 0 0 2 0.363478 0.353739 0.696087 0.248539 131 670 801
2 3 2011-01-03 1 0 1 0 1 1 1 0.196364 0.189405 0.437273 0.248309 120 1229 1349
3 4 2011-01-04 1 0 1 0 2 1 1 0.200000 0.212122 0.590435 0.160296 108 1454 1562
4 5 2011-01-05 1 0 1 0 3 1 1 0.226957 0.229270 0.436957 0.186900 82 1518 1600
In [144]:
hour.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
instant       17379 non-null int64
dteday        17379 non-null object
season        17379 non-null int64
yr            17379 non-null int64
mnth          17379 non-null int64
hr            17379 non-null int64
holiday       17379 non-null int64
weekday       17379 non-null int64
workingday    17379 non-null int64
weathersit    17379 non-null int64
temp          17379 non-null float64
atemp         17379 non-null float64
hum           17379 non-null float64
windspeed     17379 non-null float64
casual        17379 non-null int64
registered    17379 non-null int64
cnt           17379 non-null int64
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB
In [145]:
hour.shape
Out[145]:
(17379, 17)
In [146]:
day.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
instant       731 non-null int64
dteday        731 non-null object
season        731 non-null int64
yr            731 non-null int64
mnth          731 non-null int64
holiday       731 non-null int64
weekday       731 non-null int64
workingday    731 non-null int64
weathersit    731 non-null int64
temp          731 non-null float64
atemp         731 non-null float64
hum           731 non-null float64
windspeed     731 non-null float64
casual        731 non-null int64
registered    731 non-null int64
cnt           731 non-null int64
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB
In [147]:
day.shape
Out[147]:
(731, 16)
In [148]:
hour.describe()
Out[148]:
instant season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
count 17379.0000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000 17379.000000
mean 8690.0000 2.501640 0.502561 6.537775 11.546752 0.028770 3.003683 0.682721 1.425283 0.496987 0.475775 0.627229 0.190098 35.676218 153.786869 189.463088
std 5017.0295 1.106918 0.500008 3.438776 6.914405 0.167165 2.005771 0.465431 0.639357 0.192556 0.171850 0.192930 0.122340 49.305030 151.357286 181.387599
min 1.0000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.020000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 4345.5000 2.000000 0.000000 4.000000 6.000000 0.000000 1.000000 0.000000 1.000000 0.340000 0.333300 0.480000 0.104500 4.000000 34.000000 40.000000
50% 8690.0000 3.000000 1.000000 7.000000 12.000000 0.000000 3.000000 1.000000 1.000000 0.500000 0.484800 0.630000 0.194000 17.000000 115.000000 142.000000
75% 13034.5000 3.000000 1.000000 10.000000 18.000000 0.000000 5.000000 1.000000 2.000000 0.660000 0.621200 0.780000 0.253700 48.000000 220.000000 281.000000
max 17379.0000 4.000000 1.000000 12.000000 23.000000 1.000000 6.000000 1.000000 4.000000 1.000000 1.000000 1.000000 0.850700 367.000000 886.000000 977.000000
In [149]:
day.describe()
Out[149]:
instant season yr mnth holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
count 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000 731.000000
mean 366.000000 2.496580 0.500684 6.519836 0.028728 2.997264 0.683995 1.395349 0.495385 0.474354 0.627894 0.190486 848.176471 3656.172367 4504.348837
std 211.165812 1.110807 0.500342 3.451913 0.167155 2.004787 0.465233 0.544894 0.183051 0.162961 0.142429 0.077498 686.622488 1560.256377 1937.211452
min 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.059130 0.079070 0.000000 0.022392 2.000000 20.000000 22.000000
25% 183.500000 2.000000 0.000000 4.000000 0.000000 1.000000 0.000000 1.000000 0.337083 0.337842 0.520000 0.134950 315.500000 2497.000000 3152.000000
50% 366.000000 3.000000 1.000000 7.000000 0.000000 3.000000 1.000000 1.000000 0.498333 0.486733 0.626667 0.180975 713.000000 3662.000000 4548.000000
75% 548.500000 3.000000 1.000000 10.000000 0.000000 5.000000 1.000000 2.000000 0.655417 0.608602 0.730209 0.233214 1096.000000 4776.500000 5956.000000
max 731.000000 4.000000 1.000000 12.000000 1.000000 6.000000 1.000000 3.000000 0.861667 0.840896 0.972500 0.507463 3410.000000 6946.000000 8714.000000
In [150]:
print(hour.isna().sum()/len(hour)*100)
instant       0.0
dteday        0.0
season        0.0
yr            0.0
mnth          0.0
hr            0.0
holiday       0.0
weekday       0.0
workingday    0.0
weathersit    0.0
temp          0.0
atemp         0.0
hum           0.0
windspeed     0.0
casual        0.0
registered    0.0
cnt           0.0
dtype: float64
In [151]:
print(day.isna().sum()/len(day)*100)
instant       0.0
dteday        0.0
season        0.0
yr            0.0
mnth          0.0
holiday       0.0
weekday       0.0
workingday    0.0
weathersit    0.0
temp          0.0
atemp         0.0
hum           0.0
windspeed     0.0
casual        0.0
registered    0.0
cnt           0.0
dtype: float64
In [152]:
fig, ax = plt.subplots()
hr_cnt = hour.groupby(['hr']).cnt.agg({'max','min','mean','sum'})
print (hr_cnt.sort_values(by=['sum'],ascending=False))
hr_cnt['sum'].plot()
ax.set_xlabel('Hour')
ax.set_ylabel('Count')
       sum  max        mean  min
hr                              
17  336860  976  461.452055   15
18  309772  977  425.510989   23
8   261001  839  359.011004    5
16  227748  783  311.983562   11
19  226789  743  311.523352   11
13  184919  760  253.661180   11
12  184414  776  253.315934    3
15  183149  750  251.233196    7
14  175652  750  240.949246   12
20  164550  567  226.030220   11
9   159438  426  219.309491   14
7   154171  596  212.064649    1
11  151320  663  208.143054   10
10  126257  539  173.668501    8
21  125445  584  172.314560    6
22   95612  502  131.335165    9
23   63941  256   87.831044    2
6    55132  213   76.044138    1
0    39130  283   53.898072    2
1    24164  168   33.375691    1
2    16352  132   22.869930    1
5    14261   66   19.889819    1
3     8174   79   11.727403    1
4     4428   28    6.352941    1
Out[152]:
Text(0, 0.5, 'Count')
In [153]:
hr_mnth = hour.groupby(['mnth']).cnt.agg({'max','min','mean','sum'})
print (hr_mnth.sort_values(by=['sum'],ascending=False))
hr_mnth['sum'].plot()
         sum  max        mean  min
mnth                              
8     351194  941  238.097627    1
6     346342  900  240.515278    1
9     345991  977  240.773138    1
7     344948  913  231.819892    1
5     331686  873  222.907258    1
10    322352  963  222.158511    1
4     269094  822  187.260960    1
11    254831  729  177.335421    1
3     228920  957  155.410726    1
12    211036  759  142.303439    1
2     151352  610  112.865026    1
1     134933  559   94.424773    1
Out[153]:
<matplotlib.axes._subplots.AxesSubplot at 0x29ab9645dc8>
In [154]:
hr_weekday = hour.groupby(['weekday'])['cnt'].sum()
print (hr_weekday.sort_values(ascending=False))
hr_weekday.plot()
weekday
5    487790
4    485395
6    477807
3    473048
2    469109
1    455503
0    444027
Name: cnt, dtype: int64
Out[154]:
<matplotlib.axes._subplots.AxesSubplot at 0x29ab96c5048>
In [155]:
#season (1:winter, 2:spring, 3:summer, 4:fall)
hr_season = hour.groupby(['season'])['cnt'].sum()
print (hr_season.sort_values(ascending=False))
hr_season.plot()
season
3    1061129
2     918589
4     841613
1     471348
Name: cnt, dtype: int64
Out[155]:
<matplotlib.axes._subplots.AxesSubplot at 0x29ab9725488>
In [156]:
hour.iloc[hour['cnt'].idxmax()]
Out[156]:
instant            14774
dteday        2012-09-12
season                 3
yr                     1
mnth                   9
hr                    18
holiday                0
weekday                3
workingday             1
weathersit             1
temp                0.66
atemp             0.6212
hum                 0.44
windspeed         0.2537
casual                91
registered           886
cnt                  977
Name: 14773, dtype: object
In [157]:
day.iloc[day['cnt'].idxmax()]
Out[157]:
instant              624
dteday        2012-09-15
season                 3
yr                     1
mnth                   9
holiday                0
weekday                6
workingday             0
weathersit             1
temp            0.608333
atemp           0.585867
hum             0.501667
windspeed       0.247521
casual              3160
registered          5554
cnt                 8714
Name: 623, dtype: object
In [158]:
hour[hour['dteday']=='2012-09-15']
Out[158]:
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
14827 14828 2012-09-15 3 1 9 0 0 6 0 1 0.60 0.5606 0.83 0.1045 38 169 207
14828 14829 2012-09-15 3 1 9 1 0 6 0 1 0.60 0.5909 0.73 0.0000 8 101 109
14829 14830 2012-09-15 3 1 9 2 0 6 0 1 0.58 0.5455 0.78 0.1045 18 75 93
14830 14831 2012-09-15 3 1 9 3 0 6 0 1 0.60 0.5909 0.73 0.2537 6 31 37
14831 14832 2012-09-15 3 1 9 4 0 6 0 2 0.60 0.5909 0.69 0.3582 3 3 6
14832 14833 2012-09-15 3 1 9 5 0 6 0 1 0.58 0.5455 0.60 0.5224 1 15 16
14833 14834 2012-09-15 3 1 9 6 0 6 0 1 0.54 0.5152 0.49 0.4179 6 27 33
14834 14835 2012-09-15 3 1 9 7 0 6 0 1 0.54 0.5152 0.52 0.2836 10 63 73
14835 14836 2012-09-15 3 1 9 8 0 6 0 1 0.56 0.5303 0.49 0.4179 43 169 212
14836 14837 2012-09-15 3 1 9 9 0 6 0 1 0.60 0.6212 0.43 0.4179 79 263 342
14837 14838 2012-09-15 3 1 9 10 0 6 0 1 0.62 0.6212 0.41 0.3881 119 323 442
14838 14839 2012-09-15 3 1 9 11 0 6 0 1 0.64 0.6212 0.38 0.3881 228 399 627
14839 14840 2012-09-15 3 1 9 12 0 6 0 1 0.66 0.6212 0.36 0.3582 287 419 706
14840 14841 2012-09-15 3 1 9 13 0 6 0 1 0.68 0.6364 0.36 0.1940 327 377 704
14841 14842 2012-09-15 3 1 9 14 0 6 0 1 0.68 0.6364 0.34 0.3284 325 390 715
14842 14843 2012-09-15 3 1 9 15 0 6 0 2 0.68 0.6364 0.34 0.2836 312 342 654
14843 14844 2012-09-15 3 1 9 16 0 6 0 2 0.66 0.6212 0.36 0.2239 350 433 783
14844 14845 2012-09-15 3 1 9 17 0 6 0 2 0.66 0.6212 0.36 0.2537 295 434 729
14845 14846 2012-09-15 3 1 9 18 0 6 0 2 0.64 0.6212 0.36 0.2836 232 382 614
14846 14847 2012-09-15 3 1 9 19 0 6 0 1 0.62 0.6212 0.41 0.1642 169 309 478
14847 14848 2012-09-15 3 1 9 20 0 6 0 1 0.60 0.6212 0.43 0.0896 89 241 330
14848 14849 2012-09-15 3 1 9 21 0 6 0 1 0.56 0.5303 0.52 0.1045 86 210 296
14849 14850 2012-09-15 3 1 9 22 0 6 0 1 0.56 0.5303 0.52 0.0000 82 197 279
14850 14851 2012-09-15 3 1 9 23 0 6 0 1 0.54 0.5152 0.60 0.0000 47 182 229
In [159]:
round(hour['registered'].sum()/hour['cnt'].sum()*100,2)
Out[159]:
81.17
In [160]:
round(hour['casual'].sum()/hour['cnt'].sum()*100,2)
Out[160]:
18.83

denormalize (Temp, Windspeed, and hum)

In [161]:
#The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)

hour['temp'] = (hour['temp']*47)-8
print (hour['temp'].min())
print(hour['temp'].max())
-7.06
39.0
In [162]:
#atemp: Normalized feeling temperature in Celsius. The values are derived 
#via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
hour['atemp'] = (hour['atemp']*66)-16
print (hour['atemp'].min())
print(hour['atemp'].max())
-16.0
50.0
In [163]:
#Hum*100
hour['hum']=hour['hum']*100
#Windspeed *76
hour['windspeed']=hour['windspeed']*76
In [164]:
#Droping unused columns
#Holiday and Working day are the sam we will drop Holiday
#Casual and registered will not be used in this stude
hour.drop(['holiday','casual','registered'],axis=1,inplace=True)
In [ ]:
 
In [165]:
#coorelation
corr = hour.corr()
print (corr)
             instant    season        yr      mnth        hr   weekday  \
instant     1.000000  0.404046  0.866014  0.489164 -0.004775  0.001357   
season      0.404046  1.000000 -0.010742  0.830386 -0.006117 -0.002335   
yr          0.866014 -0.010742  1.000000 -0.010473 -0.003867 -0.004485   
mnth        0.489164  0.830386 -0.010473  1.000000 -0.005772  0.010400   
hr         -0.004775 -0.006117 -0.003867 -0.005772  1.000000 -0.003498   
weekday     0.001357 -0.002335 -0.004485  0.010400 -0.003498  1.000000   
workingday -0.003416  0.013743 -0.002196 -0.003477  0.002285  0.035955   
weathersit -0.014198 -0.014524 -0.019157  0.005400 -0.020203  0.003311   
temp        0.136178  0.312025  0.040913  0.201691  0.137603 -0.001795   
atemp       0.137615  0.319380  0.039222  0.208096  0.133750 -0.008821   
hum         0.009577  0.150625 -0.083546  0.164411 -0.276498 -0.037158   
windspeed  -0.074505 -0.149773 -0.008740 -0.135386  0.137252  0.011502   
cnt         0.278379  0.178056  0.250495  0.120638  0.394071  0.026900   

            workingday  weathersit      temp     atemp       hum  windspeed  \
instant      -0.003416   -0.014198  0.136178  0.137615  0.009577  -0.074505   
season        0.013743   -0.014524  0.312025  0.319380  0.150625  -0.149773   
yr           -0.002196   -0.019157  0.040913  0.039222 -0.083546  -0.008740   
mnth         -0.003477    0.005400  0.201691  0.208096  0.164411  -0.135386   
hr            0.002285   -0.020203  0.137603  0.133750 -0.276498   0.137252   
weekday       0.035955    0.003311 -0.001795 -0.008821 -0.037158   0.011502   
workingday    1.000000    0.044672  0.055390  0.054667  0.015688  -0.011830   
weathersit    0.044672    1.000000 -0.102640 -0.105563  0.418130   0.026226   
temp          0.055390   -0.102640  1.000000  0.987672 -0.069881  -0.023125   
atemp         0.054667   -0.105563  0.987672  1.000000 -0.051918  -0.062336   
hum           0.015688    0.418130 -0.069881 -0.051918  1.000000  -0.290105   
windspeed    -0.011830    0.026226 -0.023125 -0.062336 -0.290105   1.000000   
cnt           0.030284   -0.142426  0.404772  0.400929 -0.322911   0.093234   

                 cnt  
instant     0.278379  
season      0.178056  
yr          0.250495  
mnth        0.120638  
hr          0.394071  
weekday     0.026900  
workingday  0.030284  
weathersit -0.142426  
temp        0.404772  
atemp       0.400929  
hum        -0.322911  
windspeed   0.093234  
cnt         1.000000  
In [166]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
mask = np.triu(np.ones_like(corr, dtype=np.bool))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Dataset featurs' Correlation  ", fontsize =10)
Out[166]:
Text(0.5, 1, "Dataset featurs' Correlation  ")
In [167]:
fig,ax = plt.subplots()
fig.set_size_inches(18, 8)
sns.pointplot(data=hour[['hr','cnt','season']],
              x='hr',
              y='cnt',
              hue='season',
              ax=ax)
ax.set(title="Season - hourly distribution of counts",xlabel='Hour',ylabel='Total Count')
Out[167]:
[Text(0, 0.5, 'Total Count'),
 Text(0.5, 0, 'Hour'),
 Text(0.5, 1.0, 'Season - hourly distribution of counts')]
In [168]:
fig,ax = plt.subplots()
fig.set_size_inches(18, 8)
sns.pointplot(data=hour[['hr',
                           'cnt',
                           'weekday']],
              x='hr',
              y='cnt',
              hue='weekday',
              ax=ax)
ax.set(title="Weekday - hourly distribution of counts",xlabel='Hour',ylabel='Total Count')
Out[168]:
[Text(0, 0.5, 'Total Count'),
 Text(0.5, 0, 'Hour'),
 Text(0.5, 1.0, 'Weekday - hourly distribution of counts')]
In [169]:
#Checking for outliners
In [170]:
fig,(ax1,ax2) = plt.subplots(ncols=2)
fig.set_size_inches(18, 8)
sns.boxplot(data=hour['cnt'],ax=ax1)
sns.boxplot(data=hour[['temp','windspeed']],ax=ax2)
Out[170]:
<matplotlib.axes._subplots.AxesSubplot at 0x29ac0d1d248>
In [172]:
hour.columns
Out[172]:
Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')
In [173]:
hour.index=hour['dteday']

Z-Score-

Wikipedia Definition The Z-score is the signed number of standard deviations by which the value of an observation or data point is above the mean value of what is being observed or measured. The intuition behind Z-score is to describe any data point by finding their relationship with the Standard Deviation and Mean of the group of data points. Z-score is finding the distribution of data where mean is 0 and standard deviation is 1 i.e. normal distribution. You must be wondering that, how does this help in identifying the outliers? Well, while calculating the Z-score we re-scale and center the data and look for data points which are too far from zero. These data points which are way too far from zero will be treated as the outliers. In most of the cases a threshold of 3 or -3 is used i.e if the Z-score value is greater than or less than 3 or -3 respectively, that data point will be identified as outliers. We will use Z-score function defined in scipy library to detect the outliers.

In [174]:
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(hour))
print(z)
hour_o = hour[(z < 3).all(axis=1)]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-174-37e23f4fd2b1> in <module>
      1 from scipy import stats
      2 import numpy as np
----> 3 z = np.abs(stats.zscore(hour))
      4 print(z)
      5 hour_o = hour[(z < 3).all(axis=1)]

~\AppData\Local\Continuum\anaconda3\lib\site-packages\scipy\stats\stats.py in zscore(a, axis, ddof)
   2307     """
   2308     a = np.asanyarray(a)
-> 2309     mns = a.mean(axis=axis)
   2310     sstd = a.std(axis=axis, ddof=ddof)
   2311     if axis and mns.ndim < a.ndim:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\core\_methods.py in _mean(a, axis, dtype, out, keepdims)
     76     if isinstance(ret, mu.ndarray):
     77         ret = um.true_divide(
---> 78                 ret, rcount, out=ret, casting='unsafe', subok=False)
     79         if is_float16_result and out is None:
     80             ret = arr.dtype.type(ret)

TypeError: unsupported operand type(s) for /: 'str' and 'int'
In [175]:
#lets check the shape

print ('Original Dataset',hour.shape,'\n','After removing outliers ',hour_o.shape,'\n','Difference is ',hour.shape[0]-hour_o.shape[0])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-175-2f68c2b05050> in <module>
      1 #lets check the shape
      2 
----> 3 print ('Original Dataset',hour.shape,'\n','After removing outliers ',hour_o.shape,'\n','Difference is ',hour.shape[0]-hour_o.shape[0])

NameError: name 'hour_o' is not defined
In [176]:
fig,ax = plt.subplots()
fig.set_size_inches(18, 8)
sns.boxplot(data=hour[['cnt',
                          'hr']],x='hr',y='cnt',ax=ax)
ax.set(title="Checking for outliners in day hours",xlabel='Hour',ylabel='Total Count')
Out[176]:
[Text(0, 0.5, 'Total Count'),
 Text(0.5, 0, 'Hour'),
 Text(0.5, 1.0, 'Checking for outliners in day hours')]
In [177]:
sns.lmplot('temp','cnt',row='workingday',col='season',data=day,palette='RdBu_r',fit_reg=True)
Out[177]:
<seaborn.axisgrid.FacetGrid at 0x29ac0e7e108>
In [178]:
G1= hour
G1['workingday'] = np.where(G1['workingday'] == '0', 'Not Working Day', G1['workingday'])
G1['workingday'] = np.where(G1['workingday'] =='1', 'Working Day', G1['workingday'])
g = sns.catplot(x="hr", y="cnt",
                hue="workingday", col="season",
                data=G1, kind="bar",
                height=10, aspect=1)
C:\Users\AbdulsalamFawzi\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops\__init__.py:1115: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  result = method(y)
In [179]:
G2= hour
G2['holiday'] = np.where(G1['holiday'] == '0', 'Not a Holiday', G2['holiday'])
G2['holiday'] = np.where(G1['holiday'] =='1', 'Holiday', G2['holiday'])
g = sns.catplot(x="hr", y="cnt",
                hue="holiday", col="season",
                data=G2, kind="bar",
                height=10, aspect=1)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'holiday'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-179-3403a25c68f6> in <module>
      1 G2= hour
----> 2 G2['holiday'] = np.where(G1['holiday'] == '0', 'Not a Holiday', G2['holiday'])
      3 G2['holiday'] = np.where(G1['holiday'] =='1', 'Holiday', G2['holiday'])
      4 g = sns.catplot(x="hr", y="cnt",
      5                 hue="holiday", col="season",

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2978             if self.columns.nlevels > 1:
   2979                 return self._getitem_multilevel(key)
-> 2980             indexer = self.columns.get_loc(key)
   2981             if is_integer(indexer):
   2982                 indexer = [indexer]

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'holiday'
In [ ]:
 
In [ ]:
#not needed for the project
hour.drop('instant',axis=1,inplace=True)
plt.figure(figsize=(20,5))
mask = np.zeros_like(hour.corr(), dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(hour.corr(),cmap='RdBu_r',mask=mask, annot=True)

Data Transformation

In [193]:
MOD_READY = hour
In [194]:
MOD_READY.columns
Out[194]:
Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')
In [195]:
print(MOD_READY.columns)
MOD_READY.index= MOD_READY['dteday']
Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')
In [196]:
del MOD_READY['dteday']
In [197]:
MOD_READY = pd.get_dummies(MOD_READY, columns=['hr','season','workingday','weathersit'],drop_first=False)
In [198]:
MOD_READY.info()
<class 'pandas.core.frame.DataFrame'>
Index: 17379 entries, 2011-01-01 to 2012-12-31
Data columns (total 43 columns):
instant                   17379 non-null int64
yr                        17379 non-null int64
mnth                      17379 non-null int64
weekday                   17379 non-null int64
temp                      17379 non-null float64
atemp                     17379 non-null float64
hum                       17379 non-null float64
windspeed                 17379 non-null float64
cnt                       17379 non-null int64
hr_0                      17379 non-null uint8
hr_1                      17379 non-null uint8
hr_2                      17379 non-null uint8
hr_3                      17379 non-null uint8
hr_4                      17379 non-null uint8
hr_5                      17379 non-null uint8
hr_6                      17379 non-null uint8
hr_7                      17379 non-null uint8
hr_8                      17379 non-null uint8
hr_9                      17379 non-null uint8
hr_10                     17379 non-null uint8
hr_11                     17379 non-null uint8
hr_12                     17379 non-null uint8
hr_13                     17379 non-null uint8
hr_14                     17379 non-null uint8
hr_15                     17379 non-null uint8
hr_16                     17379 non-null uint8
hr_17                     17379 non-null uint8
hr_18                     17379 non-null uint8
hr_19                     17379 non-null uint8
hr_20                     17379 non-null uint8
hr_21                     17379 non-null uint8
hr_22                     17379 non-null uint8
hr_23                     17379 non-null uint8
season_1                  17379 non-null uint8
season_2                  17379 non-null uint8
season_3                  17379 non-null uint8
season_4                  17379 non-null uint8
workingday_0              17379 non-null uint8
workingday_Working Day    17379 non-null uint8
weathersit_1              17379 non-null uint8
weathersit_2              17379 non-null uint8
weathersit_3              17379 non-null uint8
weathersit_4              17379 non-null uint8
dtypes: float64(4), int64(5), uint8(34)
memory usage: 1.9+ MB
In [199]:
MOD_READY
Out[199]:
instant yr mnth weekday temp atemp hum windspeed cnt hr_0 ... season_1 season_2 season_3 season_4 workingday_0 workingday_Working Day weathersit_1 weathersit_2 weathersit_3 weathersit_4
dteday
2011-01-01 1 0 1 6 3.28 3.0014 81.0 0.0000 16 1 ... 1 0 0 0 1 0 1 0 0 0
2011-01-01 2 0 1 6 2.34 1.9982 80.0 0.0000 40 0 ... 1 0 0 0 1 0 1 0 0 0
2011-01-01 3 0 1 6 2.34 1.9982 80.0 0.0000 32 0 ... 1 0 0 0 1 0 1 0 0 0
2011-01-01 4 0 1 6 3.28 3.0014 75.0 0.0000 13 0 ... 1 0 0 0 1 0 1 0 0 0
2011-01-01 5 0 1 6 3.28 3.0014 75.0 0.0000 1 0 ... 1 0 0 0 1 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2012-12-31 17375 1 12 1 4.22 1.0016 60.0 12.4792 119 0 ... 1 0 0 0 0 1 0 1 0 0
2012-12-31 17376 1 12 1 4.22 1.0016 60.0 12.4792 89 0 ... 1 0 0 0 0 1 0 1 0 0
2012-12-31 17377 1 12 1 4.22 1.0016 60.0 12.4792 90 0 ... 1 0 0 0 0 1 1 0 0 0
2012-12-31 17378 1 12 1 4.22 1.9982 56.0 10.2068 61 0 ... 1 0 0 0 0 1 1 0 0 0
2012-12-31 17379 1 12 1 4.22 1.9982 65.0 10.2068 49 0 ... 1 0 0 0 0 1 1 0 0 0

17379 rows × 43 columns

In [200]:
MOD_READY_CORR = MOD_READY.corr()
In [201]:
MOD_READY_CORR['cnt']
Out[201]:
instant                   0.278379
yr                        0.250495
mnth                      0.120638
weekday                   0.026900
temp                      0.404772
atemp                     0.400929
hum                      -0.322911
windspeed                 0.093234
cnt                       1.000000
hr_0                     -0.156054
hr_1                     -0.179420
hr_2                     -0.190250
hr_3                     -0.200296
hr_4                     -0.206352
hr_5                     -0.193936
hr_6                     -0.130467
hr_7                      0.026036
hr_8                      0.195313
hr_9                      0.034382
hr_10                    -0.018195
hr_11                     0.021519
hr_12                     0.073609
hr_13                     0.074060
hr_14                     0.059395
hr_15                     0.071259
hr_16                     0.141443
hr_17                     0.313996
hr_18                     0.272114
hr_19                     0.140710
hr_20                     0.042154
hr_21                    -0.019769
hr_22                    -0.067009
hr_23                    -0.117160
season_1                 -0.245456
season_2                  0.060692
season_3                  0.151621
season_4                  0.029421
workingday_0             -0.030284
workingday_Working Day    0.030284
weathersit_1              0.117478
weathersit_2             -0.046902
weathersit_3             -0.128034
weathersit_4             -0.008340
Name: cnt, dtype: float64
In [202]:
print(*MOD_READY.columns)
instant yr mnth weekday temp atemp hum windspeed cnt hr_0 hr_1 hr_2 hr_3 hr_4 hr_5 hr_6 hr_7 hr_8 hr_9 hr_10 hr_11 hr_12 hr_13 hr_14 hr_15 hr_16 hr_17 hr_18 hr_19 hr_20 hr_21 hr_22 hr_23 season_1 season_2 season_3 season_4 workingday_0 workingday_Working Day weathersit_1 weathersit_2 weathersit_3 weathersit_4
In [203]:
MOD_READY.rename(columns={'workingday_Working Day':'workingday_1'}, inplace=True)
In [204]:
MOD_READY.mean()
Out[204]:
instant         8690.000000
yr                 0.502561
mnth               6.537775
weekday            3.003683
temp              15.358397
atemp             15.401157
hum               62.722884
windspeed         14.447418
cnt              189.463088
hr_0               0.041775
hr_1               0.041659
hr_2               0.041142
hr_3               0.040106
hr_4               0.040106
hr_5               0.041257
hr_6               0.041717
hr_7               0.041832
hr_8               0.041832
hr_9               0.041832
hr_10              0.041832
hr_11              0.041832
hr_12              0.041890
hr_13              0.041947
hr_14              0.041947
hr_15              0.041947
hr_16              0.042005
hr_17              0.042005
hr_18              0.041890
hr_19              0.041890
hr_20              0.041890
hr_21              0.041890
hr_22              0.041890
hr_23              0.041890
season_1           0.244088
season_2           0.253697
season_3           0.258703
season_4           0.243512
workingday_0       0.317279
workingday_1       0.682721
weathersit_1       0.656712
weathersit_2       0.261465
weathersit_3       0.081650
weathersit_4       0.000173
dtype: float64
In [206]:
### STATSMODELS ###
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection  import train_test_split
import numpy as np
# create a fitted model
ML = smf.ols(formula='cnt ~ hr_0 + hr_1 +hr_2+ hr_3+ hr_4 +hr_5 +hr_6+ hr_7+ hr_8+ hr_9+ hr_10+ hr_11+ hr_12+ hr_13+ hr_14+ hr_15+ hr_16+ hr_17 +hr_18 +hr_19+ hr_20+ hr_21+ hr_22+ hr_23 + season_1 + season_2 + season_3 + season_4 + workingday_0 + workingday_1 + weathersit_1 + weathersit_2 + weathersit_3+ weathersit_4 + temp + atemp + hum + windspeed', data=MOD_READY).fit()

# print the coefficients
ML.params
Out[206]:
Intercept        76.194648
hr_0           -118.837329
hr_1           -135.642941
hr_2           -143.778880
hr_3           -153.761950
hr_4           -155.838462
hr_5           -139.676707
hr_6            -81.197322
hr_7             53.148652
hr_8            192.417976
hr_9             43.023614
hr_10           -13.426535
hr_11            10.196858
hr_12            48.017778
hr_13            42.087134
hr_14            25.658775
hr_15            34.936660
hr_16            97.332619
hr_17           251.805368
hr_18           220.882231
hr_19           113.662233
hr_20            35.246303
hr_21           -13.196061
hr_22           -49.335397
hr_23           -87.529968
season_1         -9.786496
season_2         26.132728
season_3          7.219388
season_4         52.629029
workingday_0     34.835760
workingday_1     41.358888
weathersit_1     44.878897
weathersit_2     38.848762
weathersit_3    -15.911694
weathersit_4      8.378682
temp              4.049163
atemp             1.438659
hum              -1.020891
windspeed        -0.527336
dtype: float64
In [207]:
print(ML.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    cnt   R-squared:                       0.626
Model:                            OLS   Adj. R-squared:                  0.626
Method:                 Least Squares   F-statistic:                     855.3
Date:                Tue, 14 Apr 2020   Prob (F-statistic):               0.00
Time:                        20:10:19   Log-Likelihood:            -1.0649e+05
No. Observations:               17379   AIC:                         2.130e+05
Df Residuals:                   17344   BIC:                         2.133e+05
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       76.1946      8.361      9.113      0.000      59.807      92.582
hr_0          -118.8373      4.079    -29.133      0.000    -126.833    -110.842
hr_1          -135.6429      4.091    -33.156      0.000    -143.662    -127.624
hr_2          -143.7789      4.134    -34.780      0.000    -151.882    -135.676
hr_3          -153.7619      4.195    -36.651      0.000    -161.985    -145.539
hr_4          -155.8385      4.210    -37.013      0.000    -164.091    -147.586
hr_5          -139.6767      4.165    -33.536      0.000    -147.840    -131.513
hr_6           -81.1973      4.143    -19.600      0.000     -89.317     -73.077
hr_7            53.1487      4.114     12.919      0.000      45.085      61.212
hr_8           192.4180      4.084     47.120      0.000     184.414     200.422
hr_9            43.0236      4.056     10.607      0.000      35.073      50.974
hr_10          -13.4265      4.048     -3.317      0.001     -21.361      -5.492
hr_11           10.1969      4.062      2.510      0.012       2.234      18.159
hr_12           48.0178      4.089     11.743      0.000      40.003      56.033
hr_13           42.0871      4.114     10.230      0.000      34.023      50.152
hr_14           25.6588      4.139      6.200      0.000      17.547      33.771
hr_15           34.9367      4.147      8.425      0.000      26.808      43.065
hr_16           97.3326      4.131     23.563      0.000      89.236     105.429
hr_17          251.8054      4.113     61.225      0.000     243.744     259.867
hr_18          220.8822      4.082     54.111      0.000     212.881     228.883
hr_19          113.6622      4.057     28.013      0.000     105.709     121.615
hr_20           35.2463      4.046      8.711      0.000      27.315      43.178
hr_21          -13.1961      4.045     -3.262      0.001     -21.125      -5.267
hr_22          -49.3354      4.051    -12.177      0.000     -57.277     -41.394
hr_23          -87.5300      4.060    -21.560      0.000     -95.488     -79.572
season_1        -9.7865      2.772     -3.530      0.000     -15.221      -4.352
season_2        26.1327      2.624      9.958      0.000      20.989      31.276
season_3         7.2194      3.205      2.252      0.024       0.937      13.502
season_4        52.6290      2.596     20.271      0.000      47.540      57.718
workingday_0    34.8358      4.293      8.114      0.000      26.420      43.251
workingday_1    41.3589      4.262      9.704      0.000      33.005      49.713
weathersit_1    44.8789     14.138      3.174      0.002      17.167      72.591
weathersit_2    38.8488     14.158      2.744      0.006      11.098      66.600
weathersit_3   -15.9117     14.309     -1.112      0.266     -43.958      12.135
weathersit_4     8.3787     50.101      0.167      0.867     -89.825     106.582
temp             4.0492      0.648      6.252      0.000       2.780       5.319
atemp            1.4387      0.498      2.888      0.004       0.462       2.415
hum             -1.0209      0.059    -17.423      0.000      -1.136      -0.906
windspeed       -0.5273      0.100     -5.261      0.000      -0.724      -0.331
==============================================================================
Omnibus:                     1956.109   Durbin-Watson:                   0.425
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             4284.404
Skew:                           0.695   Prob(JB):                         0.00
Kurtosis:                       4.997   Cond. No.                     7.24e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.63e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [210]:
ML.predict({'hr_0':[0],'hr_1':[0], 'hr_2':[0],'hr_3':[0], 'hr_4':[0], 'hr_5':[0], 'hr_6':[0], 'hr_7':[0], 'hr_8':[0], 'hr_9':[0], 'hr_10':[0],
       'hr_11':[0], 'hr_12':[1], 'hr_13':[0], 'hr_14':[0], 'hr_15':[0], 'hr_16':[0], 'hr_17':[0], 'hr_18':[0],
       'hr_19':[0], 'hr_20':[0], 'hr_21':[0], 'hr_22':[0], 'hr_23':[0],'season_1':[0],'season_2':[1],'season_3':[0],'season_4':[0]
        ,'workingday_0':[1],'workingday_1':[0],'weathersit_1':[1],'weathersit_2':[0],'weathersit_3':[0],'weathersit_4':[0],'temp':[15.35],'atemp':[15.4] , 'hum':[62.7] , 'windspeed':[14.44]   })
Out[210]:
0    242.745185
dtype: float64
In [212]:
# create X and y
feature_cols = [ 'temp','hr_0','hr_1', 'hr_2',
       'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10',
       'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18',
       'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23','season_1' ,'season_2', 'season_3',
       'season_4', 'workingday_0','workingday_1', 'weathersit_1','weathersit_2', 'weathersit_3','atemp', 'hum' , 'windspeed',
       'weathersit_4']
X_ML2 = MOD_READY[feature_cols]
y_ML2 = MOD_READY.cnt

# instantiate and fit
ML2 = LinearRegression()
ML2.fit(X_ML2, y_ML2)

# print the coefficients
print (ML2.intercept_)
print (ML2.coef_)
308504244038.9033
[ 4.04785959e+00 -3.03107596e+11 -3.03107596e+11 -3.03107596e+11
 -3.03107596e+11 -3.03107596e+11 -3.03107596e+11 -3.03107595e+11
 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11
 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11
 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11
 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11 -3.03107595e+11
 -3.03107595e+11 -3.11200057e+09 -3.11200053e+09 -3.11200055e+09
 -3.11200051e+09 -2.61693350e+09 -2.61693350e+09  3.32285589e+08
  3.32285583e+08  3.32285528e+08  1.43988329e+00 -1.02082654e+00
 -5.27214822e-01  3.32285552e+08]
In [213]:
### STATSMODELS ###

# you have to create a DataFrame since the Statsmodels formula interface expects it
X_new = pd.DataFrame({ 'temp':[0.26],'hr_0':[0],'hr_1':[0], 'hr_2':[0],'hr_3':[0], 'hr_4':[0], 'hr_5':[0], 'hr_6':[0], 'hr_7':[0], 'hr_8':[0], 'hr_9':[0], 'hr_10':[0],
       'hr_11':[0], 'hr_12':[1], 'hr_13':[0], 'hr_14':[0], 'hr_15':[0], 'hr_16':[0], 'hr_17':[0], 'hr_18':[0],
       'hr_19':[0], 'hr_20':[0], 'hr_21':[0], 'hr_22':[0], 'hr_23':[0],'season_1':[0] ,'season_2':[1], 'season_3':[0],
       'season_4':[0], 'workingday_0':[1],'workingday_1':[0], 'weathersit_1':[1],'weathersit_2':[0], 'weathersit_3':[0],'weathersit_4':[0],'atemp':[15.4] , 'hum':[62.7] , 'windspeed':[14.44]})

# predict for a new observation
ML2.predict(X_new)
Out[213]:
array([4.79820356e+09])
In [214]:
x_ML2 = ML2.predict(X_ML2)
In [215]:
MOD_READY['cnt_predect']=x_ML2
In [216]:
MOD_READY.tail()
Out[216]:
instant yr mnth weekday temp atemp hum windspeed cnt hr_0 ... season_2 season_3 season_4 workingday_0 workingday_1 weathersit_1 weathersit_2 weathersit_3 weathersit_4 cnt_predect
dteday
2012-12-31 17375 1 12 1 4.22 1.0016 60.0 12.4792 119 0 ... 0 0 0 0 1 0 1 0 0 210.970093
2012-12-31 17376 1 12 1 4.22 1.0016 60.0 12.4792 89 0 ... 0 0 0 0 1 0 1 0 0 132.554504
2012-12-31 17377 1 12 1 4.22 1.0016 60.0 12.4792 90 0 ... 0 0 0 0 1 1 0 0 0 90.139343
2012-12-31 17378 1 12 1 4.22 1.9982 56.0 10.2068 61 0 ... 0 0 0 0 1 1 0 0 0 60.716614
2012-12-31 17379 1 12 1 4.22 1.9982 65.0 10.2068 49 0 ... 0 0 0 0 1 1 0 0 0 13.334778

5 rows × 44 columns

In [217]:
Model_Plot = MOD_READY.pivot_table(index=['mnth','hr_12'] , margins=False ,values=['cnt',  'cnt_predect'],aggfunc=np.mean)
Model_Plot.plot(figsize=(20,8), title="Prediction Vs Actual", grid=True)
plt.ylabel('Data')
Out[217]:
Text(0, 0.5, 'Data')
In [220]:
#using train test split 
feature_cols = ['hr_0','hr_1', 'hr_2',
       'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10',
       'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18',
       'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23','season_1', 'season_2', 'season_3',
       'season_4', 'workingday_1', 'weathersit_1','weathersit_2', 'weathersit_3','atemp', 'hum', 'windspeed',
       'weathersit_4']
X = MOD_READY[feature_cols]
y = MOD_READY.cnt
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)
(13903, 36) (13903,)
(3476, 36) (3476,)
In [221]:
# fit a model
from sklearn.metrics import mean_squared_error
from math import sqrt
lm = linear_model.LinearRegression()
model_FIT = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
In [222]:
print ('Score:', model_FIT.score(X_test, y_test))
y_pred = lm.predict(X_test)
print('RMSE: %.2f' % sqrt(mean_squared_error(y_test, y_pred)))
Score: 0.6344355793161516
RMSE: 111.11

without feature Engineering

In [223]:
feature_cols = ['hr','season']
X_N = hour[feature_cols]
y_N = hour.cnt
# create training and testing vars
X_train_N, X_test_N, y_train_N, y_test_N = train_test_split(X_N, y_N, test_size=0.2)
print (X_train_N.shape, y_train_N.shape)
print (X_test_N.shape, y_test_N.shape)
(13903, 2) (13903,)
(3476, 2) (3476,)
In [224]:
lm_N = linear_model.LinearRegression()
model_N = lm_N.fit(X_train_N, y_train_N)
predictions_N = lm_N.predict(X_test_N)
In [225]:
print ('Score:', model_N.score(X_test_N, y_test_N))
Score: 0.17872167427694863
In [226]:
print ('Score:', model_N.score(X_train_N, y_train_N))
y_pred_N = model_N.predict(X_test_N)
print('RMSE: %.2f' % sqrt(mean_squared_error(y_test_N, y_pred_N)))
Score: 0.18988026408634662
RMSE: 160.48
In [ ]:
 

data transforamtion / scaling (Hour Column)

In [227]:
scaled_hour=hour
#df.index = df['dteday']
In [228]:
#Normalizing the hour columns
from sklearn.preprocessing import StandardScaler
cols_to_norm = ['hr']
scaled_hour[cols_to_norm] = StandardScaler().fit_transform(scaled_hour[cols_to_norm])
In [229]:
#Feature Engineering for the categorical columns 
scaled_hour = pd.get_dummies(scaled_hour, columns=['season','workingday','weathersit'],drop_first=False)
In [230]:
scaled_hour.head()
Out[230]:
instant yr mnth hr weekday temp atemp hum windspeed cnt season_1 season_2 season_3 season_4 workingday_0 workingday_Working Day weathersit_1 weathersit_2 weathersit_3 weathersit_4
dteday
2011-01-01 1 0 1 -1.670004 6 3.28 3.0014 81.0 0.0 16 1 0 0 0 1 0 1 0 0 0
2011-01-01 2 0 1 -1.525374 6 2.34 1.9982 80.0 0.0 40 1 0 0 0 1 0 1 0 0 0
2011-01-01 3 0 1 -1.380744 6 2.34 1.9982 80.0 0.0 32 1 0 0 0 1 0 1 0 0 0
2011-01-01 4 0 1 -1.236115 6 3.28 3.0014 75.0 0.0 13 1 0 0 0 1 0 1 0 0 0
2011-01-01 5 0 1 -1.091485 6 3.28 3.0014 75.0 0.0 1 1 0 0 0 1 0 1 0 0 0
In [231]:
scaled_hour.columns
Out[231]:
Index(['instant', 'yr', 'mnth', 'hr', 'weekday', 'temp', 'atemp', 'hum',
       'windspeed', 'cnt', 'season_1', 'season_2', 'season_3', 'season_4',
       'workingday_0', 'workingday_Working Day', 'weathersit_1',
       'weathersit_2', 'weathersit_3', 'weathersit_4'],
      dtype='object')
In [232]:
scaled_hour.rename(columns={'workingday_Working Day':'workingday_1'}, inplace=True)
In [233]:
# Model With scaled Hours and categorical features
feature_cols = ['hr','season_1', 'season_2', 'season_3','season_4', 'workingday_1',
                'weathersit_1','weathersit_2', 'weathersit_3','temp','atemp','hum','windspeed']
X_scaled_hour = scaled_hour[feature_cols]
y_scaled_hour = scaled_hour.cnt
# create training and testing vars
X_train_Scaled, X_test_Scaled, y_train_Scaled, y_test_Scaled = train_test_split(X_scaled_hour, y_scaled_hour, test_size=0.2)
print (X_train_Scaled.shape, y_train_Scaled.shape)
print (X_test_Scaled.shape, y_test_Scaled.shape)
(13903, 13) (13903,)
(3476, 13) (3476,)
In [234]:
lm_scaled_hour = linear_model.LinearRegression()
model_scaled_hour = lm_scaled_hour.fit(X_train_Scaled, y_train_Scaled)
predictions_scaled_hour = lm_scaled_hour.predict(X_test_Scaled)
In [235]:
print ('Score:', model_scaled_hour.score(X_train_Scaled, y_train_Scaled))
Score: 0.3523183524462943
In [236]:
y_pred_Scaled = model_scaled_hour.predict(X_test_Scaled)
print('RMSE: %.2f' % sqrt(mean_squared_error(y_test_Scaled, y_pred_Scaled)))
RMSE: 145.39
In [237]:
#Hour Normalized model
scaled_Hours_Model_Score =  model_scaled_hour.score(X_train_Scaled, y_train_Scaled)
scaled_Hours_Model_RMSE = sqrt(mean_squared_error(y_test_Scaled, y_pred_Scaled))
In [238]:
#train test split Model score

FIT_Model_Score = model_FIT.score(X_test, y_test)

FIT_Hours_Model_RMSE= sqrt(mean_squared_error(y_test, y_pred))
In [239]:
#Model Without feature engineering 
Model_NO_FEATURE_Score =model_N.score(X_train_N, y_train_N)
y_pred_N = model_N.predict(X_test_N)
Hours_NO_FEATURE_SModel_RMSE =sqrt(mean_squared_error(y_test_N, y_pred_N))
In [240]:
Model_Scores = pd.DataFrame([ ['Hour Normalized model', scaled_Hours_Model_Score,scaled_Hours_Model_RMSE],['Train test split Model', 
                                                                                                          FIT_Model_Score,FIT_Hours_Model_RMSE ],
                             ['Model Without feature engineering',Model_NO_FEATURE_Score,Hours_NO_FEATURE_SModel_RMSE ]], columns=['Model', 'Score', 'RMSE'])
In [241]:
Model_Scores
Out[241]:
Model Score RMSE
0 Hour Normalized model 0.352318 145.385351
1 Train test split Model 0.634436 111.105389
2 Model Without feature engineering 0.189880 160.478719
In [242]:
#Random Forest
In [244]:
from sklearn.model_selection import train_test_split
feature_cols = ['hr_0','hr_1', 'hr_2',
       'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10',
       'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18',
       'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23','season_1', 'season_2', 'season_3',
       'season_4', 'workingday_1', 'weathersit_1','weathersit_2', 'weathersit_3','temp','atemp','hum','windspeed',
       'weathersit_4']
X_R = MOD_READY[feature_cols]
y_R = MOD_READY.cnt
# create training and testing vars


X_train_R, X_test_R, y_train_R, y_test_R = train_test_split(X_R, y_R, test_size=0.2, random_state=0)
In [245]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_R = sc.fit_transform(X_train_R)
X_test_R = sc.transform(X_test_R)
In [246]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train_R, y_train_R)
y_pred_R = regressor.predict(X_test_R)
In [247]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_R, y_pred_R))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_R, y_pred_R))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_R, y_pred_R)))
Mean Absolute Error: 51.949629842731106
Mean Squared Error: 6358.156178620574
Root Mean Squared Error: 79.7380472460956
In [248]:
x_R_P = regressor.predict(X_R)
In [249]:
MOD_READY['cnt_predect_R']=x_R_P
In [257]:
MOD_READY.columns
Out[257]:
Index(['instant', 'yr', 'mnth', 'weekday', 'temp', 'atemp', 'hum', 'windspeed',
       'cnt', 'hr_0', 'hr_1', 'hr_2', 'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7',
       'hr_8', 'hr_9', 'hr_10', 'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15',
       'hr_16', 'hr_17', 'hr_18', 'hr_19', 'hr_20', 'hr_21', 'hr_22', 'hr_23',
       'season_1', 'season_2', 'season_3', 'season_4', 'workingday_0',
       'workingday_1', 'weathersit_1', 'weathersit_2', 'weathersit_3',
       'weathersit_4', 'cnt_predect', 'cnt_predect_R'],
      dtype='object')

lets print the results for the original count, prediction using train test and the prediction using Random Forest in all hours grouped by month(max,min,and mean)

In [258]:
for col in MOD_READY.filter(regex='hr').columns:
    Model_Plot = MOD_READY.pivot_table(index=['mnth',col] , margins=False ,values=['cnt',  'cnt_predect','cnt_predect_R'],aggfunc=np.mean)
    Model_Plot.plot(figsize=(20,8), title="Prediction Vs Actual", grid=True)
    plt.ylabel('Data')
C:\Users\AbdulsalamFawzi\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py:338: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  fig = self.plt.figure(figsize=self.figsize)
In [259]:
for col in MOD_READY.filter(regex='hr').columns:
    Model_Plot = MOD_READY.pivot_table(index=['mnth',col] , margins=False ,values=['cnt',  'cnt_predect','cnt_predect_R'],aggfunc=np.max)
    Model_Plot.plot(figsize=(20,8), title="Prediction Vs Actual", grid=True)
    plt.ylabel('Data')
In [260]:
for col in MOD_READY.filter(regex='hr').columns:
    Model_Plot = MOD_READY.pivot_table(index=['mnth',col] , margins=False ,values=['cnt',  'cnt_predect','cnt_predect_R'],aggfunc=np.min)
    Model_Plot.plot(figsize=(20,8), title="Prediction Vs Actual", grid=True)
    plt.ylabel('Data')
In [ ]: